import numpy as np
import pandas as pd
import json
import math
import os
import matplotlib.pyplot as plt
import plotly.offline as pyo
import plotly.graph_objs as go
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
path=os.getcwd()
path
train=pd.read_csv(path +"\\train_finaldata.csv", sep=",", na_values=["?",",","#","NaN","unknown",""])
train.head()
train.shape
train["date"]=pd.to_datetime(train["date"])
with open(path+"\\city_dict.json") as f:
data = json.load(f)
city = {value: key for key, value in data.items()}
city={k:int(v) for k, v in city.items()}
city={value: key for key, value in city.items()}
import plotly.express as px
city_sales=train.groupby(["city"])["sales"].sum().sort_values(ascending=True).reset_index()
city_sales["sales"]=np.round(city_sales["sales"])
city_sales["city"]=city_sales["city"].map(city)
fig5 = px.bar(city_sales, x="city", y="sales", barmode='group',title="Aggregated sales for each City")
fig5.show()
mumbai=train[train["city"]==4]
mumbai.groupby(["medicine"])["sales"].mean().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Mumbai")
plt.ylabel("sales")
plt.figure(figsize=(5,5))
plt.show()
mumbai[mumbai["year"]==2015].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Mumbai in the year 2015")
plt.ylabel("sales")
plt.show()
mumbai[mumbai["year"]==2016].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Mumbai in the year 2016")
plt.ylabel("sales")
plt.show()
mumbai[mumbai["year"]==2017].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Mumbai in the year 2017")
plt.ylabel("sales")
plt.show()
mumbai[mumbai["year"]==2018].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Mumbai in the year 2018")
plt.ylabel("sales")
plt.show()
mumbai.groupby(["year"])["sales"].sum().plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("sales of medicines in Mumbai")
plt.ylabel("sales")
plt.show()
from pandas import DataFrame
import plotly.express as px
train1 = DataFrame({'percentage_of_sales' : np.round((mumbai.groupby( [ "year",'discounted'] ).size()/mumbai.groupby( [ "year",'discounted'] ).size().sum())*100,2)}).reset_index()
fig5 = px.bar(train1, x="year", y='percentage_of_sales', color='discounted',labels='percentage_of_sales',text='percentage_of_sales',title="Contribution of each year sales towards overall sales in Mumbai ")
fig5.show()
delhi=train[train["city"]==5]
delhi[delhi["year"]==2015].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Delhi in the year 2015")
plt.ylabel("sales")
plt.show()
delhi[delhi["year"]==2016].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Delhi in the year 2016")
plt.ylabel("sales")
plt.show()
delhi[delhi["year"]==2017].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Delhi in the year 2017")
plt.ylabel("sales")
plt.show()
delhi[delhi["year"]==2018].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Delhi in the year 2018")
plt.ylabel("sales")
plt.show()
Kolkata=train[train["city"]==1]
Kolkata[Kolkata["year"]==2015].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Kolkata in the year 2015")
plt.ylabel("sales")
plt.show()
Kolkata[Kolkata["year"]==2016].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Kolkata in the year 2016")
plt.ylabel("sales")
plt.show()
Kolkata[Kolkata["year"]==2017].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Kolkata in the year 2017")
plt.ylabel("sales")
plt.show()
Kolkata[Kolkata["year"]==2018].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Kolkata in the year 2018")
plt.ylabel("sales")
plt.show()
Bangalore=train[train["city"]==7]
Bangalore[Bangalore["year"]==2015].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Bangalore in the year 2015")
plt.ylabel("sales")
plt.show()
Bangalore[Bangalore["year"]==2016].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Bangalore in the year 2016")
plt.ylabel("sales")
plt.show()
Bangalore[Bangalore["year"]==2017].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Bangalore in the year 2017")
plt.ylabel("sales")
plt.show()
Bangalore[Bangalore["year"]==2018].groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].plot.bar()
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 sold medicine types in Bangalore in the year 2018")
plt.ylabel("sales")
plt.show()
lt=list(mumbai.groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:5].index)
data=[go.Bar(x=list(mumbai[mumbai["medicine"]==col].groupby(["year_month"])["sales"].sum().index),y=list(mumbai[mumbai["medicine"]==col].groupby(["year_month"])["sales"].sum().values),name="medicine_"+str(col)) for col in lt]
layout=go.Layout(title="Montly sales of first 5 medicines in Mumbai",xaxis={"title":"months"},yaxis=dict(title="sales"))
fig=go.Figure(data=data,layout=layout)
fig.show()
m2017=mumbai[mumbai["year"]==2017]
lt=list(m2017.groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:5].index)
data=[go.Bar(x=list(m2017[m2017["medicine"]==col].groupby(["month"])["sales"].sum().index),y=list(m2017[m2017["medicine"]==col].groupby(["month"])["sales"].sum().values),name="medicine_"+str(col)) for col in lt]
layout=go.Layout(title="Montly sales of first 10 medicines in Mumbai in the year 2017",xaxis={"title":"months"},yaxis=dict(title="sales"))
fig=go.Figure(data=data,layout=layout)
fig.show()
mum_city_sales=mumbai[mumbai["year"]==2017].groupby(["month"])["sales"].sum().sort_values(ascending=True).reset_index()
fig5 = px.bar(mum_city_sales, x="month", y="sales", barmode='group',title="Aggregated monthly sales in Mumbai in the year 2017")
fig5.show()
d2017=delhi[delhi["year"]==2017]#d2017=delhi 2017,ltd=list_delhi
ltd=list(d2017.groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:5].index)
data=[go.Bar(x=list(d2017[d2017["medicine"]==col].groupby(["month"])["sales"].sum().index),y=list(d2017[d2017["medicine"]==col].groupby(["month"])["sales"].sum().values),name="medicine_"+str(col)) for col in ltd]
layout=go.Layout(title="Montly sales of first 10 medicines in Delhi in the year 2017",xaxis={"title":"months"},yaxis=dict(title="sales"))
fig=go.Figure(data=data,layout=layout)
fig.show()
b2017=Bangalore[Bangalore["year"]==2017]#b2017=Bangalore 2017,ltb=list_bangalore
ltb=list(b2017.groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:5].index)
data=[go.Bar(x=list(b2017[b2017["medicine"]==col].groupby(["month"])["sales"].sum().index),y=list(b2017[b2017["medicine"]==col].groupby(["month"])["sales"].sum().values),name="medicine_"+str(col)) for col in ltb]
layout=go.Layout(title="Montly sales of first 10 medicines in Bangalore in the year 2017",xaxis={"title":"months"},yaxis=dict(title="sales"))
fig=go.Figure(data=data,layout=layout)
fig.show()
k2017=Kolkata[Kolkata["year"]==2017]#b2017=kolkata 2017,ltb=list_kolkata
ltk=list(k2017.groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:5].index)
data=[go.Bar(x=list(k2017[k2017["medicine"]==col].groupby(["month"])["sales"].sum().index),y=list(k2017[k2017["medicine"]==col].groupby(["month"])["sales"].sum().values),name="medicine_"+str(col)) for col in ltk]
layout=go.Layout(title="Montly sales of first 10 medicines in kolkata in the year 2017",xaxis={"title":"months"},yaxis=dict(title="sales"))
fig=go.Figure(data=data,layout=layout)
fig.show()
m2016=mumbai[mumbai["year"]==2016]#m2016=Mumbai 2016,ltm6=list_mumbai_2016
ltm6=list(m2016.groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:5].index)
data=[go.Bar(x=list(m2016[m2016["medicine"]==col].groupby(["month"])["sales"].sum().index),y=list(m2016[m2016["medicine"]==col].groupby(["month"])["sales"].sum().values),name="medicine_"+str(col)) for col in ltm6]
layout=go.Layout(title="Montly sales of first 5 medicines in Mumbai in the year 2016",xaxis={"title":"months"},yaxis=dict(title="sales"))
fig=go.Figure(data=data,layout=layout)
fig.show()
d2016=delhi[delhi["year"]==2016]#d2016=Delhi 2016,ltd6=list_Delhi_2016
ltd6=list(d2016.groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:5].index)
data=[go.Bar(x=list(d2016[d2016["medicine"]==col].groupby(["month"])["sales"].sum().index),y=list(d2016[d2016["medicine"]==col].groupby(["month"])["sales"].sum().values),name="medicine_"+str(col)) for col in ltd6]
layout=go.Layout(title="Montly sales of first 5 medicines in Delhi in the year 2016",xaxis={"title":"months"},yaxis=dict(title="sales"))
fig=go.Figure(data=data,layout=layout)
fig.show()
k2016=Kolkata[Kolkata["year"]==2016]#d2016=Kolkata 2016,ltd6=list_Kolkata_2016
ltk6=list(k2016.groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:5].index)
data=[go.Bar(x=list(k2016[k2016["medicine"]==col].groupby(["month"])["sales"].sum().index),y=list(k2016[k2016["medicine"]==col].groupby(["month"])["sales"].sum().values),name="medicine_"+str(col)) for col in ltk6]
layout=go.Layout(title="Montly sales of first 5 medicines in Kolkata in the year 2016",xaxis={"title":"months"},yaxis=dict(title="sales"))
fig=go.Figure(data=data,layout=layout)
fig.show()
year_cumulative = train.groupby(["year"])["sales"].sum()
year_cumulative = year_cumulative.reset_index()
year_cumulative["sales"]=np.round(year_cumulative["sales"])
fig = px.bar(year_cumulative, x="year", y="sales", barmode='group',labels="sales",text="sales",title="Aggregated Yearly sales")
fig.show()
d2015=delhi[delhi["year"]==2015]#d2015=delhi 2015,ltd5=list_Delhi_2015
ltd5=list(d2015.groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].index)
data=[go.Scatter(x=list(d2015[d2015["medicine"]==col].groupby(["month"])["sales"].sum().index),y=list(d2015[d2015["medicine"]==col].groupby(["month"])["sales"].sum().values),mode="lines+markers",name="medicine_"+str(col)) for col in ltd5]
layout=go.Layout(title="Montly sales of first 10 medicines in Delhi in the year 2015",xaxis={"title":"months"},yaxis=dict(title="sales"))
fig=go.Figure(data=data,layout=layout)
fig.show()
k2015=Kolkata[Kolkata["year"]==2015]#k2015=Kolkata 2015,ltk5=list_Kolkata_2015
ltk5=list(k2015.groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].index)
data=[go.Scatter(x=list(k2015[k2015["medicine"]==col].groupby(["month"])["sales"].sum().index),y=list(k2015[k2015["medicine"]==col].groupby(["month"])["sales"].sum().values),mode="markers",name="medicine_"+str(col)) for col in ltk5]
layout=go.Layout(title="Montly sales of first 10 medicines in Kolkata in the year 2015",xaxis={"title":"months"},yaxis=dict(title="sales"))
fig=go.Figure(data=data,layout=layout)
fig.show()
b2015=Bangalore[Bangalore["year"]==2015]#b2015=Bangalore 2015,ltb5=list_Bangalore_2015
ltb5=list(b2015.groupby(["medicine"])["sales"].sum().sort_values(ascending=False)[:10].index)
data=[go.Scatter(x=list(b2015[b2015["medicine"]==col].groupby(["month"])["sales"].sum().index),y=list(b2015[b2015["medicine"]==col].groupby(["month"])["sales"].sum().values),mode="lines",name="medicine_"+str(col)) for col in ltb5]
layout=go.Layout(title="Montly sales of first 10 medicines in Bangalore in the year 2015",xaxis={"title":"months"},yaxis=dict(title="sales"))
fig=go.Figure(data=data,layout=layout)
fig.show()
yr_mnt=train.groupby(["year","month"])["sales"].sum().reset_index()
yr_mnt["year-month"]=yr_mnt.apply(lambda row:str(int(row["year"]))+"-"+str(int(row["month"])),axis=1)
fig5 = px.bar(yr_mnt, x="year-month", y="sales", barmode='group',title="Aggregated sales for each month")
fig5.show()
def cty_yr_mnt(df,cols):
df1=pd.DataFrame()
lt=["year","month"]
for col in cols:
df1[col]=df[df["city"]==col].groupby(["year", "month"])["sales"].sum().reset_index()["sales"]
df1.columns=df1.columns.map(city)
df1[lt[0]]=df[df["city"]==1].groupby(["year", "month"])["sales"].sum().reset_index()["year"]
df1[lt[1]]=df[df["city"]==1].groupby(["year", "month"])["sales"].sum().reset_index()["month"]
df1["year-month"]=df1.apply(lambda row:str(int(row[lt[0]]))+"-"+str(int(row[lt[1]])),axis=1)
df1=df1.drop(lt,axis=1)
return df1
lis=[4,5,7,1,6] # list of top 5 cities
cty_yr_mnt=cty_yr_mnt(train,lis)
data = [go.Bar(x= cty_yr_mnt['year-month'],y=cty_yr_mnt[col],name=col) for col in list(cty_yr_mnt.columns[:-1])]
layout=go.Layout(title="aggregated monthly sales from top 5 cities")
fig=go.Figure(data,layout)
fig.show()
dfs=pd.DataFrame(np.ceil(train.groupby(["date"])["footfall"].mean()/250))# dfs=date_footfall_sale_dataframe
dfs["sales"]=np.ceil(train.groupby(["date"])["sales"].mean())
dfs.reset_index()
fig = px.histogram(dfs, x=dfs["sales"],title="Distrbution of sales")
fig.update_layout(xaxis_title="Sales data",yaxis_title="Count of sales")
fig.show()
#dfs=date_footfall_sale_dataframe
fig = px.histogram(dfs, x=dfs["footfall"],title="Distrbution of footfall")
fig.update_layout(xaxis_title="Footfall",yaxis_title="count of footfall")
fig.show()
#dfs=date_footfall_sale_dataframe
fig = px.histogram(dfs, x=np.log1p(dfs["sales"]),title="Distrbution of log transformed sales data")
fig.update_layout(xaxis_title="log transformed sales",yaxis_title="count of log transformed sales")
fig.show()
#dfs=date_footfall_sale_dataframe
fig = px.histogram(dfs, x=np.log1p(dfs["footfall"]),title="Distrbution of log transformed footfall data")
fig.update_layout(xaxis_title="log transformed Footfall",yaxis_title="count of log transformed footfall")
fig.show()
#dfs=date_footfall_sale_dataframe
data = [go.Box(y=np.log1p(dfs["footfall"]),name='Footfall'),
go.Box(y=np.log1p(dfs["sales"]),name='Sales')]
fig=go.Figure(data)
fig.show()
import scipy.stats
import pylab
scipy.stats.probplot(train.groupby(["day","month","year"])["sales"].mean(), plot = pylab)
plt.title("QQ Plot", size = 24)
pylab.show()
import scipy.stats
import pylab
scipy.stats.probplot(np.log1p(train.groupby(["day","month","year"])["sales"].mean()), plot = pylab)
plt.title("QQ Plot", size = 24)
pylab.show()
tr1=train.set_index("date")
tr1['sales'].plot(figsize= (20,8), color = "blue")
plt.show()
daily_aggregated_sales_log=np.log(dfs.sales)# dfs=date_footfall_sale_dataframe
moving_avg=daily_aggregated_sales_log.rolling(window=30).mean()
moving_std=daily_aggregated_sales_log.rolling(window=30).std()
daily_aggregated_sales_log.plot(label='Log Scale')
moving_avg.plot(label='moving_avg')
moving_std.plot(label='moving_std')
plt.legend(loc='best')
plt.show()
from statsmodels.tsa.seasonal import seasonal_decompose
result = seasonal_decompose(dfs['sales'].rolling(30, center=True).mean().dropna(), period=30)
plt.rcParams["figure.figsize"] = (20,20)
result.plot()
plt.show()
import statsmodels.api as sm
import statsmodels.tsa.api as smt
import statsmodels.formula.api as smf
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(dfs.sales, lags=40, ax=ax1) #
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(dfs.sales, lags=40, ax=ax2)#
from statsmodels.tsa.stattools import adfuller
def adf_test(timeseries):
#Perform Dickey-Fuller test:
print ('Results of Dickey-Fuller Test:')
dftest = adfuller(timeseries, autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
print (dfoutput)
#apply adf test on the series
adf_test(dfs['sales'])
from statsmodels.tsa.stattools import kpss
def kpss_test(timeseries):
#Perform Kwiatkowski-Phillips-Schmidt-Shin test(for trend stationarity):
print ('Results of KPSS Test:')
dftest = kpss(dfs["sales"], "ct")
dfoutput = pd.Series(dftest[0:3], index=['Test Statistic','p-value','#Lags Used'])
for key,value in dftest[3].items():
dfoutput['Critical Value (%s)'%key] = value
print (dfoutput)
#apply kpss test on the series
kpss_test(dfs['sales'])
from statsmodels.tsa.stattools import adfuller
def adf1_test(timeseries):
#Perform Dickey-Fuller test:
dftest1 = adfuller(timeseries, autolag='AIC')
dfoutput1 = pd.Series(dftest1[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest1[4].items():
dfoutput1['Critical Value (%s)'%key] = value
return dfoutput1[1]
def med_agg(df,meds):
lis=[]
for c in meds:
#print(c)
t=adf1_test(df[df["medicine"]==c].groupby(["date"])["sales"].mean().reset_index()["sales"])
lis.append(t)
df1=pd.DataFrame(lis,meds,columns=["p-values"])
df1.index.name="medicine"
return df1
med_valcnt=pd.DataFrame(train["medicine"].value_counts())
med_valcnt.index.name="medicine"
med_valcnt.columns=["values"]
med_nopval=list(med_valcnt[med_valcnt["values"]<12].index)
len(med_nopval)
med_p=med_agg(train,list(train["medicine"].value_counts().index)[:-82])
med_p[med_p["p-values"]<0.05].shape[0],med_p[med_p["p-values"]>0.05].shape[0]
def med_city(df,cols):
lis=[]
for col in cols:
t=adf1_test(df[df["city_medicine"]==col]["sales"])
lis.append(t)
df1=pd.DataFrame(lis,cols,columns=["p-values"])
df1.index.name="medicine"
return df1
city_med_valcnt=pd.DataFrame(train["city_medicine"].value_counts())
city_med_valcnt.index.name="city_medicine"
city_med_valcnt.columns=["values"]
city_med_nopval=list(city_med_valcnt[city_med_valcnt["values"]<12].index)
len(city_med_nopval)
len(city_med_valcnt.index[:-590]),len(city_med_valcnt.index)
city_med_p=med_city(train,list(city_med_valcnt.index[:-590]))
city_med_p[city_med_p["p-values"]<0.05].shape[0],city_med_p[city_med_p["p-values"]>0.05].shape[0]
test=pd.read_csv(path +"\\test_finaldata.csv", sep=",", na_values=["?",",","#","NaN","unknown",""])
test.dtypes
test_bool=pd.DataFrame(test["city_medicine"].value_counts())
.isin command checks each value in dataframe is in list or not
test_bool["bool"]=test["city_medicine"].value_counts().index.isin(list(city_med_valcnt.index))
test_bool[test_bool["bool"]==False].shape[0]
tmed=pd.DataFrame(test["medicine"].value_counts())
tmed["fbool"]=test["medicine"].value_counts().index.isin(list(train["medicine"].value_counts().index))
tmed[tmed["fbool"]==False].shape[0]
from statsmodels.tsa.arima_model import ARIMA
arima_mod = sm.tsa.ARIMA(dfs.sales, (1,1,1)).fit(disp=0)
print(arima_mod.summary())